pandas
pip install pandas
plotly
pip install plotly
pycountry - for getting country codes for geoPlot
pip install pycountry
geopandas - for working with geo plots
pip install geopandas
# imports
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import os
import pycountry
# import geopandas as gpd
import plotly.io as pio
# Make the combined "notebook+pdf" renderer plotly's default, so plain
# fig.show() calls below use it without passing renderer=... each time.
pio.renderers.default = "notebook+pdf"
The university dataset contains many .csv files.
For this experiment we will explore only cwurData.csv and timesData.csv.
def loadCSVData(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Path of the file to load.

    Returns:
        The parsed DataFrame when ``path`` has a ``.csv`` extension
        (compared case-insensitively, so ``.CSV`` is accepted too),
        otherwise ``None``.
    """
    extension = os.path.splitext(path)[1]
    # read only csv files from the dataset
    if extension.lower() != ".csv":
        return None
    return pd.read_csv(path, delimiter=',')
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")
# info on columns: DataFrame.info() writes its report to stdout itself and
# returns None, so wrapping it in print() only appended a stray "None" line
timesUniData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2603 entries, 0 to 2602 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 world_rank 2603 non-null object 1 university_name 2603 non-null object 2 country 2603 non-null object 3 teaching 2603 non-null float64 4 international 2603 non-null object 5 research 2603 non-null float64 6 citations 2603 non-null float64 7 income 2603 non-null object 8 total_score 2603 non-null object 9 num_students 2544 non-null object 10 student_staff_ratio 2544 non-null float64 11 international_students 2536 non-null object 12 female_male_ratio 2370 non-null object 13 year 2603 non-null int64 dtypes: float64(4), int64(1), object(9) memory usage: 284.8+ KB None
# number of missing (null) values in each column
timesUniData.isnull().sum()
world_rank 0 university_name 0 country 0 teaching 0 international 0 research 0 citations 0 income 0 total_score 0 num_students 59 student_staff_ratio 59 international_students 67 female_male_ratio 233 year 0 dtype: int64
# Remember the row count so the number of discarded rows can be reported.
before = len(timesUniData)
# Discard every row that has a missing value in any column.
timesUniData = timesUniData.dropna(axis=0, how="any")
print(f"Dropped {before - len(timesUniData)} NaN values")
# Verify that no column has missing values left.
print(timesUniData.isna().sum())
Dropped 241 NaN values world_rank 0 university_name 0 country 0 teaching 0 international 0 research 0 citations 0 income 0 total_score 0 num_students 0 student_staff_ratio 0 international_students 0 female_male_ratio 0 year 0 dtype: int64
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    """Convert a column of student counts into a list of floats.

    Args:
        df: The column to convert. When it holds strings, every value has
            its commas removed (presumably thousands separators, e.g.
            "2,243") before being converted with ``float``.

    Returns:
        The parsed values for a string/object column; for any other dtype
        the values are returned unchanged as a list.
    """
    # Newer pandas versions may store text in a dedicated string dtype rather
    # than ``object``, so both are treated as "needs parsing".
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        # Iterate the values directly: Series.iteritems() no longer exists
        # in pandas >= 2.0.
        return [float(item.replace(',', '')) for item in df]
    return list(df)
def parseGenderRatio(df : pd.Series) -> list[float]:
    """Extract the first component of "a:b" ratio strings.

    Args:
        df: The ratio column. String values are split on ":" and every
            part must parse as an integer (surrounding spaces are fine).

    Returns:
        For a string/object column, a list with the first integer of each
        ratio (presumably the female share, going by the column name
        ``female_male_ratio``), or 0 where the value cannot be parsed.
        For any other dtype the values are returned unchanged as a list.
    """
    if not (df.dtype == object or pd.api.types.is_string_dtype(df.dtype)):
        return list(df)

    shares = []
    # Iterate the values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0.
    for item in df:
        try:
            parts = [int(part) for part in item.split(":")]
        except (AttributeError, TypeError, ValueError):
            # Not a string, or not of the form "<int>:<int>".
            # No division happens here, so no ZeroDivisionError case is needed.
            shares.append(0)  # no data
        else:
            shares.append(parts[0])
    return shares
# international students is given in the format 27%, so removing the % sign is
# enough to convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    """Convert a column of percentage strings such as "27%" into floats.

    Args:
        df: The column to convert. When it holds strings, every "%" is
            removed from each value before converting it with ``float``.

    Returns:
        The parsed values for a string/object column; for any other dtype
        the values are returned unchanged as a list.
    """
    # Newer pandas versions may store text in a dedicated string dtype rather
    # than ``object``, so both are treated as "needs parsing".
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        # Iterate the values directly: Series.iteritems() no longer exists
        # in pandas >= 2.0.
        return [float(item.replace('%', '')) for item in df]
    return list(df)
# The number of students, the gender ratio and the share of international
# students are given as strings, so we need to convert them into numeric types.
# pd.to_numeric returns a new object instead of converting in place, so its
# result is what gets stored back into the DataFrame.
num_stud = parseNumberOfStudents(timesUniData.num_students)
timesUniData["num_students"] = pd.to_numeric(num_stud)

gender_ratio = parseGenderRatio(timesUniData.female_male_ratio)
timesUniData["female_male_ratio"] = pd.to_numeric(gender_ratio)

international_students = parseInternationalStudents(
    timesUniData.international_students)
timesUniData["international_students"] = pd.to_numeric(international_students)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would only append a stray "None" line).
timesUniData.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2362 entries, 1 to 2602 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 world_rank 2362 non-null object 1 university_name 2362 non-null object 2 country 2362 non-null object 3 teaching 2362 non-null float64 4 international 2362 non-null object 5 research 2362 non-null float64 6 citations 2362 non-null float64 7 income 2362 non-null object 8 total_score 2362 non-null object 9 num_students 2362 non-null float64 10 student_staff_ratio 2362 non-null float64 11 international_students 2362 non-null float64 12 female_male_ratio 2362 non-null int64 13 year 2362 non-null int64 dtypes: float64(6), int64(2), object(6) memory usage: 276.8+ KB None
# plot 1
# Distribution of the student-staff ratio in the years 2015 and 2016,
# drawn as two overlaid histograms (one per year).
year = [2015, 2016]
uniData = timesUniData.query(f"year in {year}")
uni2015 = uniData[uniData["year"] == 2015]
uni2016 = uniData[uniData["year"] == 2016]

hist1 = go.Histogram(x=uni2015.student_staff_ratio, name='2015',
                     marker=dict(color='rgba(119, 157, 230, 0.8)'))
hist2 = go.Histogram(x=uni2016.student_staff_ratio, name='2016',
                     marker=dict(color='rgba(220, 112, 92, 0.87)'))

# The histograms bin the student-staff ratio along x and show the number of
# universities per bin along y, so the axis titles are assigned accordingly.
# All layout settings live in this one dict (previously they were given twice
# with conflicting title positions; the later values are the ones kept here).
layout = dict(title='Student staff ratio in year 2015-2016', title_x=0.3, barmode='overlay',
              xaxis_title='Student-staff ratio', yaxis_title='Count', font_size=16)
fig = go.Figure(data=[hist1, hist2], layout=layout)
# Semi-transparent bars keep both overlaid distributions visible.
fig.update_traces(opacity=.5)
# fig.show(renderer='browser', auto_open=True)
fig.show()
# plot 2
# compare the teaching score with the world ranking for the top `num_uni`
# universities of one year; marker size encodes the number of students and
# marker colour encodes the international score
year = 2016
num_uni = 30
# the first num_uni rows of the selected year
topUnis = timesUniData[timesUniData.year == year].iloc[:num_uni]
# `international` is not stored as a numeric column, so convert it for the colour scale
colors = [float(item) for item in topUnis.international]
num_students = topUnis.num_students
data = go.Scatter(
    x=topUnis.world_rank,
    y=topUnis.teaching,
    mode='markers+text',
    marker=dict(
        color=colors,
        size=num_students,
        # scale the bubbles relative to the largest university
        sizeref=(5.0 * max(num_students)) / (25.**2),
        showscale=True,
        # Label the colour scale directly: a single-trace figure shows no
        # legend, so a legend title would never be displayed.
        colorbar=dict(title=dict(text='International score')),
    ),
    text=topUnis.university_name,
    textfont=dict(
        family="sans serif",
        size=12,
        color='#2e2e2d',
    ),
)
layout = dict(
    xaxis_title='World ranking',
    yaxis_title='Teaching score',
    title=f"World's top {num_uni} universities with number of students (size) and international score (scale) - {year}",
    font=dict(
        family="Calibri",
        size=16,
        color="RebeccaPurple",
    ),
)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
# fig.show(renderer='browser', auto_open=True)
fig.show()
# Geo plots need the ISO alpha-3 code of each country, so build a
# country name -> alpha-3 code lookup from a public ISO-3166 listing.
jsonUrl = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.json'
countyDf = pd.read_json(jsonUrl)
countryCodes = {
    name: code for name, code in zip(countyDf["name"], countyDf["alpha-3"])
}
# Some countries (UK, South Korea, Hong Kong, Taiwan, Czech Republic, Iran
# etc.) are not covered by the lookup above under these names, so they are
# added manually.
countryCodes.update({
    'United Kingdom': 'GBR',
    'Hong Kong': 'HKG',
    "South Korea": 'KOR',
    'Republic of Ireland': 'IRL',
    'Taiwan': 'TWN',
    'Czech Republic': 'CZE',
    'Iran': 'IRN',
    'Macau': 'MAC',
})
# print(countryCodes["United Kingdom"])
# plot 3
# geo plot of the distribution of the universities in the world for the year 2015
# (`year` is used below both to filter the data and in the figure title)
year = 2015
def getUniCount(df : pd.DataFrame, codes=None) -> pd.DataFrame:
    """Count the rows of ``df`` per country and attach alpha-3 country codes.

    Args:
        df: DataFrame with a ``country`` column; every row counts as one
            university.
        codes: Optional mapping of country name -> alpha-3 code. Defaults
            to the module-level ``countryCodes`` lookup.

    Returns:
        A DataFrame with the columns ``Country``, ``Number of university``
        and ``alpha3_code``, sorted by ``Number of university`` in
        descending order.

    Raises:
        KeyError: If a country in ``df`` has no entry in ``codes``.
    """
    from collections import Counter

    if codes is None:
        codes = countryCodes

    # Counter keeps the countries in order of first appearance, which matches
    # counting them row by row.
    countryCount = Counter(df["country"])

    missing = [name for name in countryCount if name not in codes]
    if missing:
        raise KeyError(f"no alpha-3 country code known for: {missing}")

    # Build the frame in one go instead of appending one row at a time.
    countDf = pd.DataFrame(
        {
            "Country": list(countryCount),
            "Number of university": list(countryCount.values()),
            "alpha3_code": [codes[name] for name in countryCount],
        },
        columns=["Country", "Number of university", "alpha3_code"],
    )
    countDf["Number of university"] = pd.to_numeric(countDf["Number of university"])
    # sort by number of university in descending order
    return countDf.sort_values(by=["Number of university"], ascending=False)
# Rows of the selected year, counted per country.
uniData = timesUniData.loc[timesUniData["year"] == year]
uniCountData = getUniCount(uniData)

# Alternative view (disabled): one bubble per country, sized by the count.
# fig = px.scatter_geo(uniCountData, locations="alpha3_code", color="Country", hover_name="Country",
#                      projection="natural earth", size="Number of university" )

# Colour every country by its number of universities.
fig = px.choropleth(
    uniCountData,
    locations="alpha3_code",
    color="Number of university",
    hover_name="Country",
    color_continuous_scale="viridis",
)
fig.update_layout(
    title=f"Geo plot of the distribution of the universities in the world for the year {year}",
    title_x=.5,
    title_font_size=20,
)
# fig.show(renderer='browser', auto_open=True)
# fig.write_image('./geo_plot.svg')
fig.show()
# plot 4
# world rank vs citations of the top 100 universities of the years 2013, 2014 and 2015
numUni = 100
# since world rankings are not given in pure int64 format we use a cheaty way of solving
# by generating a rankings list for 1 to numUni
# NOTE: these are positions within the filtered data, not the world_rank
# values themselves (rows dropped earlier leave gaps in world_rank).
rankingList = list(range(1, numUni + 1))
uni2013 = timesUniData[timesUniData["year"] == 2013].iloc[:numUni,]
uni2014 = timesUniData[timesUniData["year"] == 2014].iloc[:numUni,]
uni2015 = timesUniData[timesUniData["year"] == 2015].iloc[:numUni,]
print(uni2013)

# One scatter trace per year; each trace is named after the year it shows so
# the legend entries are distinct.
scatter2013 = go.Scatter(x=rankingList, y=uni2013.citations, name='2013',
                         text=uni2013.university_name, mode='markers',
                         marker=dict(color='rgba(119, 157, 230, 0.8)'))
scatter2014 = go.Scatter(x=rankingList, y=uni2014.citations, name='2014',
                         text=uni2014.university_name, mode='markers',
                         marker=dict(color='rgba(82, 84, 80, .8)'))
scatter2015 = go.Scatter(x=rankingList, y=uni2015.citations, name='2015',
                         text=uni2015.university_name, mode='markers',
                         marker=dict(color='rgba(220, 112, 92, 0.87)'))
data = [scatter2013, scatter2014, scatter2015]
layout = dict(title='Citations vs world ranking for the top 100 universities of year 2013, 2014 and 2015',
              legend_title='Year', title_x=0.5, title_font_size=20,
              xaxis=dict(title='World ranking', ticklen=20),
              yaxis=dict(title="Citations", ticklen=20))
fig = go.Figure(data=data, layout=layout)
# fig.show(renderer='browser', auto_open=True)
fig.show()
world_rank university_name \
602 1 California Institute of Technology
603 2 Stanford University
604 2 University of Oxford
606 5 Massachusetts Institute of Technology
607 6 Princeton University
.. ... ...
712 110 University of Sheffield
713 110 University of Sussex
714 113 University of Cape Town
715 114 Eindhoven University of Technology
716 115 Maastricht University
country teaching international research citations \
602 United States of America 96.3 59.8 99.4 99.7
603 United States of America 95.0 56.6 98.8 99.3
604 United Kingdom 89.7 88.7 98.1 95.6
606 United States of America 92.9 81.6 89.2 99.9
607 United States of America 89.5 54.5 99.4 99.8
.. ... ... ... ... ...
712 United Kingdom 49.5 68.1 46.2 71.0
713 United Kingdom 34.9 76.7 39.5 91.1
714 South Africa 34.7 75.1 45.5 79.7
715 Netherlands 44.1 69.8 51.8 63.8
716 Netherlands 39.7 85.2 53.2 62.5
income total_score num_students student_staff_ratio \
602 95.6 95.5 2243.0 6.9
603 62.4 93.7 15596.0 7.8
604 79.8 93.7 19919.0 11.6
606 92.9 93.1 11074.0 9.0
607 79.5 92.7 7929.0 8.4
.. ... ... ... ...
712 41.5 56.2 23311.0 15.5
713 32.1 56.2 12001.0 17.4
714 87.3 55.8 20040.0 12.1
715 100.0 55.6 8176.0 16.0
716 99.1 55.5 15626.0 18.9
international_students female_male_ratio year
602 27.0 33 2013
603 22.0 42 2013
604 34.0 46 2013
606 33.0 37 2013
607 27.0 45 2013
.. ... ... ...
712 31.0 50 2013
713 35.0 54 2013
714 18.0 53 2013
715 14.0 19 2013
716 48.0 56 2013
[100 rows x 14 columns]